# Folder holding the time-series CSVs in the CSSEGISandData/COVID-19 repository.
# The address is assembled from short pieces to avoid one very long line.
base_url <- paste0(
  'https://raw.githubusercontent.com/',
  'CSSEGISandData/COVID-19/master/',
  'csse_covid_19_data/csse_covid_19_time_series/'
)

# One file name per series; sprintf() is vectorised over the series names.
filename <- sprintf(
  'time_series_covid19_%s.csv',
  c('confirmed_global', 'deaths_global', 'recovered_global')
)

# Full URLs: folder + file name, in the same order as the series above.
url <- paste0(base_url, filename)
url
## [1] "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
## [2] "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
## [3] "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
# Load the confirmed-cases CSV (the first URL) as a data frame.
# Every column is read as double unless listed here; only the two
# location-name columns are text.
confirmed <- read_csv(
  url[1],
  col_types = cols(
    `Province/State` = col_character(),
    `Country/Region` = col_character(),
    .default = col_double()
  )
)

# Replace the slash-containing headers with syntactic names
# (renaming this way apparently avoids a bug in ggformula).
confirmed <- rename(
  confirmed,
  province_or_state = `Province/State`,
  country_or_region = `Country/Region`
)
# Reshape with tidyr from wide (one column per date) to long
# (one row per location and date, with a date column and a count column).
confirmed_long <- confirmed %>%
  pivot_longer(
    # Leave the four identifier columns alone. They are selected by name rather
    # than by position so the reshape stays correct if the column order changes.
    cols = -c(province_or_state, country_or_region, Lat, Long),
    names_to = "date",   # the remaining column headers become values of `date`
    values_to = "count"  # the cell values become the `count` column
  ) %>%
  # lubridate: order "mdy" parses month/day/year strings. The result is a
  # date-time (shown as <dttm> by glimpse), not a Date.
  mutate(date = parse_date_time(date, orders = "mdy"))
glimpse(confirmed_long)
## Observations: 17,920
## Variables: 6
## $ province_or_state <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ country_or_region <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afgha…
## $ Lat <dbl> 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,…
## $ Long <dbl> 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,…
## $ date <dttm> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 20…
## $ count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Drop the rows whose count is missing and hand back a plain data.frame.
# A single dplyr filter does this directly, without converting to a
# data.table and back.
df <- confirmed_long %>%
  filter(!is.na(count)) %>%
  as.data.frame()
# Countries shown in the chart.
# TODO: look into selecting the top by sum(count) so that it is updated every time?
top_countries <- c("US", "China", "Iran", "Korea, South", "Italy", "Germany", "Spain")

# Keep only those countries, then add up the counts of all provinces/states
# so there is one row per country and date.
confirmed_country <- df %>%
  filter(country_or_region %in% top_countries) %>%
  group_by(country_or_region, date) %>%
  summarise(count = sum(count))
glimpse(confirmed_country)
## Observations: 490
## Variables: 3
## Groups: country_or_region [7]
## $ country_or_region <chr> "China", "China", "China", "China", "China", "China…
## $ date <dttm> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 20…
## $ count <dbl> 548, 643, 920, 1406, 2075, 2877, 5509, 6087, 8141, …
I decided to use the plotly library for this visualization. I wanted to do some exploratory analysis to get a visual sense of how quickly cases have been growing in different countries, compared with the countries that have managed to flatten the curve. I got the idea for this visualization after reading news articles about how the US curve seemed to be following Italy’s rather than South Korea’s, and about how the South Korean government did a good job of mitigating the crisis very early on.
# Hover text, one entry per row of confirmed_country.
# - format() keeps round counts readable: paste() alone would print 100000 as
#   "1e+05". trim = TRUE stops format() from padding to a common width.
# - <br> is a line break inside a plotly hover label.
# - The empty <extra></extra> tag hides the secondary box that would otherwise
#   show the trace name next to the label.
tooltip <- with(
  confirmed_country,
  paste0(
    country_or_region,
    " <br> Confirmed Cases: ",
    format(count, big.mark = ",", scientific = FALSE, trim = TRUE),
    " <br> Date: ", date,
    "<extra></extra>"
  )
)
# Animated lollipop chart: one animation frame per date and one lollipop per
# country (segment = stick, marker = head, text = country label on the head).
plot_ly(
  confirmed_country,
  x = ~count,
  y = ~country_or_region,
  color = I("salmon"),
  # "inside" is not a valid textposition for scatter traces and is ignored;
  # the value that actually applies is the default, "middle center".
  textposition = "middle center",
  showlegend = FALSE,
  hovertemplate = tooltip,
  frame = ~factor(date) # doesn't work with ~date, need to use factor()
) %>%
  add_segments(
    x = 100, # every stick starts at x = 100
    xend = ~count,
    y = ~country_or_region,
    yend = ~country_or_region
  ) %>%
  add_markers(
    # Author's note: an empty "" name (tried in order to hide "trace1" in the
    # tooltip) breaks the chart, so the traces are named after the country.
    name = ~country_or_region,
    color = I("salmon"),
    alpha = 0.7,
    marker = list(size = 30)
  ) %>%
  add_text(
    text = ~country_or_region,
    color = I("gray30")
  ) %>%
  layout(
    title = "Reported COVID-19 Cases by Country Over Time",
    # standoff belongs to the axis title, not to the axis itself, so it has to
    # be nested inside `title` to take effect.
    yaxis = list(title = list(text = "Country", standoff = 20)),
    xaxis = list(title = "Number of Confirmed Cases")
  ) %>%
  animation_slider(
    # `type` and `tickformat` are not slider currentvalue attributes, so only
    # the prefix is kept. The label after the prefix is the frame name.
    currentvalue = list(prefix = "Date ")
  )